Importing libraries and data¶

In [1]:
#Importing relevant libraries
import polars as pl
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skewnorm, norm
import warnings
warnings.simplefilter(action='ignore')

from darts import TimeSeries
from darts.models import *
from darts.dataprocessing.transformers import Scaler
from darts.metrics import *
from darts.utils.statistics import check_seasonality
In [2]:
# Load the data
df = pl.read_csv("SnP500.csv")
In [3]:
# Data Summary
df.describe()
Out[3]:
shape: (9, 10)
statisticDateCloseVolumeInflationUnemploymentGDP_Growth_RateGDPInterest_rateCPI
strstrf64f64f64f64f64f64f64f64
"count""7555"7555.07555.07555.07555.07555.07555.07555.07555.0
"null_count""0"0.00.00.00.00.00.00.00.0
"mean"null1616.1116032.6369e92.151765.7525482.4638431.4420e133.3176142.489871
"std"null977.0934221.7951e91.1680991.6278491.8249535.0062e122.1328031.441926
"min""1/10/1993"429.0499881.499e70.6409553.65-2.7678036.8600e12-1.189357-0.355546
"25%"null1029.0300299.81e81.5585314.621.8418751.0300e132.0238851.622223
"50%"null1301.3499762.8136e91.899615.452.706371.4500e132.9605062.33769
"75%"null2050.6298833.8921e92.370346.173.7725651.8200e134.898312.951657
"max""9/9/2022"4796.5600591.1456e107.0052769.635.9454852.5500e137.1481788.0028
In [4]:
# Data Peek
df.head()
Out[4]:
shape: (5, 9)
DateCloseVolumeInflationUnemploymentGDP_Growth_RateGDPInterest_rateCPI
strf64i64f64f64f64f64f64f64
"4/1/1993"435.3800052012100002.370346.92.7517816.8600e123.5456172.951657
"5/1/1993"434.3399962403500002.370346.92.7517816.8600e123.5456172.951657
"6/1/1993"434.5199892952400002.370346.92.7517816.8600e123.5456172.951657
"7/1/1993"430.7300113048500002.370346.92.7517816.8600e123.5456172.951657
"8/1/1993"429.0499882634700002.370346.92.7517816.8600e123.5456172.951657

Data Preprocessing¶

In [5]:
#Checking on the date patterns
df["Date"].value_counts
Out[5]:
<bound method Series.value_counts of shape: (7_555,)
Series: 'Date' [str]
[
	"4/1/1993"
	"5/1/1993"
	"6/1/1993"
	"7/1/1993"
	"8/1/1993"
	…
	"23-12-2022"
	"27-12-2022"
	"28-12-2022"
	"29-12-2022"
	"30-12-2022"
]>
In [6]:
# Detect all unique delimiters in date strings as we see some different patterns w.r.t to delimiters
delimiters = {char for date in df["Date"] for char in date if not char.isdigit()}
delimiters
Out[6]:
{'-', '/'}
In [7]:
# Function to normalize date formats -- Strictly w.r.t this data
def normalize_date(date_str):
    for fmt in ("%d-%m-%Y", "%d/%m/%Y"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%d-%m-%Y")
        except ValueError:
            continue
    raise ValueError(f"Date format for {date_str} not recognized")
Kindly note the data imported is recorded on daily basis but since we need to have quarterly prediction, converting the dataset quarterly aggregating by mean.¶
In [8]:
# Common method to preprocess the DataFrame as per the current data used
def process_dataframe(df):
    # Apply the normalization function to the Date column
    df1 = df.with_columns(pl.col("Date").map_elements(normalize_date))

    # Convert the 'Date' column to datetime
    df1 = df1.with_columns(pl.col("Date").str.strptime(pl.Date))

    # Extract year and quarter from the date and create 'ds' column
    df1 = df1.with_columns((pl.col("Date").dt.year().cast(str) + 'Q' + pl.col("Date").dt.quarter().cast(str)).alias("ds"))

    # Group by 'ds' and calculate the mean for each group for all columns except 'ds'
    agg_exprs = [pl.col(col).mean().alias(f"meanQ_{col}") for col in df1.columns if col != "ds"]
    quarterly_df = df1.groupby("ds").agg(agg_exprs)

    # Convert to Pandas DataFrame
    quarterly_df_pandas = quarterly_df.to_pandas()

    # Convert 'ds' to datetime and set as index
    quarterly_df_pandas['ds'] = pd.to_datetime(quarterly_df_pandas['ds'])
    quarterly_df_pandas = quarterly_df_pandas.set_index('ds').sort_index()
    # Drop the 'mean_Date' column
    quarterly_df_pandas = quarterly_df_pandas.drop(columns=["meanQ_Date"])

    return quarterly_df_pandas

# Process the DataFrame
data = process_dataframe(df)
data
Out[8]:
meanQ_Close meanQ_Volume meanQ_Inflation meanQ_Unemployment meanQ_GDP_Growth_Rate meanQ_GDP meanQ_Interest_rate meanQ_CPI
ds
1993-01-01 442.750321 2.659718e+08 2.370340 6.90 2.751781 6.860000e+12 3.545617 2.951657
1993-04-01 445.505872 2.620033e+08 2.370340 6.90 2.751781 6.860000e+12 3.545617 2.951657
1993-07-01 453.558748 2.557414e+08 2.370340 6.90 2.751781 6.860000e+12 3.545617 2.951657
1993-10-01 464.271874 2.751294e+08 2.370340 6.90 2.751781 6.860000e+12 3.545617 2.951657
1994-01-01 469.213492 3.127857e+08 2.135424 6.12 4.028793 7.290000e+12 4.898310 2.607442
... ... ... ... ... ... ... ... ...
2021-10-01 4602.108894 4.082385e+09 4.492792 5.35 5.945485 2.330000e+13 -1.189357 4.697859
2022-01-01 4463.855477 5.028659e+09 7.005276 3.65 2.061593 2.550000e+13 0.000000 8.002800
2022-04-01 4105.667102 4.924918e+09 7.005276 3.65 2.061593 2.550000e+13 0.000000 8.002800
2022-07-01 3980.351112 4.190339e+09 7.005276 3.65 2.061593 2.550000e+13 0.000000 8.002800
2022-10-01 3851.973501 4.345159e+09 7.005276 3.65 2.061593 2.550000e+13 0.000000 8.002800

120 rows × 8 columns

In [9]:
data.columns
Out[9]:
Index(['meanQ_Close', 'meanQ_Volume', 'meanQ_Inflation', 'meanQ_Unemployment',
       'meanQ_GDP_Growth_Rate', 'meanQ_GDP', 'meanQ_Interest_rate',
       'meanQ_CPI'],
      dtype='object')
In [10]:
# Plot using Plotly -- kindly select the variables individually in the right legend to have a better visualization.
fig = px.line(data, x= data.index,
              y=data.columns, title='Time Series Data')
fig.show()
  1. The mean quarterly close price has shown a general upward trend over time, with some fluctuations, highest observed in Oct 2021.
  2. The mean quarterly volume has also shown an upward trend, indicating increased trading activity over time with peak in first Quarter in 2009.
  3. The mean quarterly inflation, unemployment and GDP_Growth_Rate has varied over time, with noticeable peaks and troughs.
  4. The mean quarterly GDP has generally increased over time, reflecting economic growth.
  5. The mean quarterly interest rate has shown significant fluctuations, may reflecting changes in monetary policy.
  6. The mean quarterly CPI has sudden an upward trend post 2019, indicating rising consumer prices.

Dealing with Outliers and Multicollinearity¶

In [11]:
#Distplot and Boxplot for each feature - Data distribution and Outlier Detection
plt.figure(figsize=[20,60])
columns = data.columns
cnt = 1
for col in columns:
    plt.subplot(14, 2, cnt)
    sns.distplot(data[col], fit=norm)
    cnt += 1
    plt.subplot(14, 2, cnt)
    sns.boxplot(data[col])
    cnt += 1
plt.tight_layout()
plt.show()

1) Features doesn't seem to align close with normal distribution except for CPI. 2) Close, Inflation, Unemployment, GDP Growth Rate and CPI are having outliers.

In [12]:
# Outlier Treatment - Values below the lower bound are replaced with the lower bound, 
# and values above the upper bound are replaced with the upper bound.

# Function to detect and treat outliers using IQR method
def treat_outliers(df, column):
    Q1 = df[column].quantile(0.25)  # Q1 is the 25th percentile, and                                  
    Q3 = df[column].quantile(0.75)  # Q3 is the 75th percentile of the data.
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR  
    upper_bound = Q3 + 1.5 * IQR  
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
    return df

# List of columns to treat for outliers
columns_to_treat = ['meanQ_Close', 'meanQ_Inflation', 'meanQ_Unemployment', 'meanQ_GDP_Growth_Rate', 'meanQ_CPI']

# Apply outlier treatment to each column
for column in columns_to_treat:
    data_treated = treat_outliers(data, column)
In [13]:
# Calculate the correlation matrix
correlation_matrix = data_treated.corr()

# Create a heatmap
fig_corr = go.Figure(data=go.Heatmap(
                   z=correlation_matrix.values,
                   x=correlation_matrix.columns,
                   y=correlation_matrix.columns,
                   colorscale='Viridis'))

fig_corr.update_layout(title='Correlation Heatmap')
fig_corr.show()
We observe that multicollinearity exists between few features, let's dig in further with the help of VIF¶
In [14]:
# Drop the target variable
independent_variables = data_treated.drop(columns=['meanQ_Unemployment'])
# Calculate VIF for each independent variable
# A high VIF value (typically greater than 10) indicates that the variance 
# of the coefficient estimate for that variable is inflated due to multicollinearity.
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(independent_variables.values, i) for i in range(independent_variables.shape[1])]
vif["features"] = independent_variables.columns
vif
Out[14]:
VIF Factor features
0 8.460349 meanQ_Close
1 8.136021 meanQ_Volume
2 5.882536 meanQ_Inflation
3 1.787589 meanQ_GDP_Growth_Rate
4 158.537938 meanQ_GDP
5 1.025189 meanQ_Interest_rate
6 5.205215 meanQ_CPI

We observe GDP is having excessively high vif value, will eliminate this as it might create noise in the model. Besides, we also observe Close and Volume being on higher and similar vif value, if it would have been greater than 10(threshold), then would have considered only one of them in the model; same goes with CPI and Inflation.

Model Building and Validation¶

Kindly note this model building is done for multivariate time series analysis where target variable is Unemployment and features are rest macroeconomic parameters.

In [19]:
# Function to create TimeSeries objects, scale data, split into train/test, fit models, and calculate metrics
def evaluate_models(data, target_col, feature_cols, models):
    # Create TimeSeries object with all features
    target_series = TimeSeries.from_dataframe(data, value_cols=target_col)
    feature_series = TimeSeries.from_dataframe(data, value_cols=feature_cols)

    # Ensure past covariates start from the earliest possible date
    feature_series = feature_series.slice_intersect(target_series)

    # Scale the data
    scaler = Scaler()
    target_scaled_series = scaler.fit_transform(target_series)
    feature_scaled_series = scaler.fit_transform(feature_series)

    # Split the data into train and test sets
    train_target, test_target = target_scaled_series.split_before(0.8)
    train_features, test_features = feature_scaled_series.split_before(0.8)

    metrics = {
        'Model': [],
        'sMAPE': [],
        'RMSE': []
    }

    predictions = {}
    
    for model_name, model in models.items():
        # Fit model
        model.fit(train_target, past_covariates=train_features)

        # Make predictions
        pred = model.predict(len(test_target))
        
        # Store predictions
        predictions[model_name] = pred

        # Calculate validation metrics
        metrics['Model'].append(model_name)
        metrics['sMAPE'].append(smape(test_target, pred))
        metrics['RMSE'].append(rmse(test_target, pred))

    # Create a DataFrame for the metrics
    metrics_df = pd.DataFrame(metrics)
    return metrics_df, predictions, target_scaled_series

# Define models
models = {
    'NHiTS': NHiTSModel(input_chunk_length=48, output_chunk_length=36),
    'TiDE': TiDEModel(input_chunk_length=48, output_chunk_length=36),
    'NBEATS': NBEATSModel(input_chunk_length=48, output_chunk_length=36)
}
In [16]:
# Evaluate models
metrics_df1, predictions1, target_scaled_series1 = evaluate_models(data_treated, 'meanQ_Unemployment', 
                             ['meanQ_Close', 'meanQ_Volume', 'meanQ_Inflation', 'meanQ_GDP_Growth_Rate', 'meanQ_Interest_rate', 'meanQ_CPI'], 
                             models)
metrics_df1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | criterion       | MSELoss          | 0      | train
1 | train_criterion | MSELoss          | 0      | train
2 | val_criterion   | MSELoss          | 0      | train
3 | train_metrics   | MetricCollection | 0      | train
4 | val_metrics     | MetricCollection | 0      | train
5 | stacks          | ModuleList       | 1.4 M  | train
-------------------------------------------------------------
1.2 M     Trainable params
172 K     Non-trainable params
1.4 M     Total params
5.621     Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                | Type             | Params | Mode 
-----------------------------------------------------------------
0 | criterion           | MSELoss          | 0      | train
1 | train_criterion     | MSELoss          | 0      | train
2 | val_criterion       | MSELoss          | 0      | train
3 | train_metrics       | MetricCollection | 0      | train
4 | val_metrics         | MetricCollection | 0      | train
5 | past_cov_projection | _ResidualBlock   | 1.4 K  | train
6 | encoders            | Sequential       | 78.2 K | train
7 | decoders            | Sequential       | 165 K  | train
8 | temporal_decoder    | _ResidualBlock   | 594    | train
9 | lookback_skip       | Linear           | 1.8 K  | train
-----------------------------------------------------------------
247 K     Trainable params
0         Non-trainable params
247 K     Total params
0.989     Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | criterion       | MSELoss          | 0      | train
1 | train_criterion | MSELoss          | 0      | train
2 | val_criterion   | MSELoss          | 0      | train
3 | train_metrics   | MetricCollection | 0      | train
4 | val_metrics     | MetricCollection | 0      | train
5 | stacks          | ModuleList       | 8.7 M  | train
-------------------------------------------------------------
8.7 M     Trainable params
3.3 K     Non-trainable params
8.7 M     Total params
34.770    Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
Out[16]:
Model sMAPE RMSE
0 NHiTS 113.325883 0.502218
1 TiDE 128.925258 0.821985
2 NBEATS 118.022760 0.559883
In [17]:
# Convert TimeSeries to pandas DataFrame for plotting
actual_scaled_df1 = target_scaled_series1.pd_dataframe()
predicted_df1 = predictions1['NHiTS'].pd_dataframe() #Chosen NHiTS as it has minm RMSE score and is the best model among rest.

# Create traces
fig = go.Figure()

# Add actual data trace
for col in actual_scaled_df1.columns:
    fig.add_trace(go.Scatter(x=actual_scaled_df1.index, y=actual_scaled_df1[col], mode='lines', name=f'Actual {col}'))

# Add predicted data trace
for col in predicted_df1.columns:
    fig.add_trace(go.Scatter(x=predicted_df1.index, y=predicted_df1[col], mode='lines', name=f'Predicted {col}'))

# Update layout
fig.update_layout(
    title="Actual vs Predicted Time Series Analysis - Treated",
    xaxis_title="Date",
    yaxis_title="Value",
    legend_title="Legend",
    width=1000,
    height=600
)

fig.show()
Above results is when treated outliers and multicollinearlity, lets check if its done on the original data¶
In [20]:
# Rerun "evaluate_models" method again before running this cell otherwise it will error out.
# Evaluate models
metrics_df2, predictions2, target_scaled_series2 = evaluate_models(data, 'meanQ_Unemployment', 
                             ['meanQ_Close', 'meanQ_Volume', 'meanQ_Inflation', 'meanQ_GDP_Growth_Rate','meanQ_GDP', 'meanQ_Interest_rate', 'meanQ_CPI'],
                             models)
metrics_df2
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | criterion       | MSELoss          | 0      | train
1 | train_criterion | MSELoss          | 0      | train
2 | val_criterion   | MSELoss          | 0      | train
3 | train_metrics   | MetricCollection | 0      | train
4 | val_metrics     | MetricCollection | 0      | train
5 | stacks          | ModuleList       | 1.5 M  | train
-------------------------------------------------------------
1.3 M     Trainable params
196 K     Non-trainable params
1.5 M     Total params
5.973     Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                | Type             | Params | Mode 
-----------------------------------------------------------------
0 | criterion           | MSELoss          | 0      | train
1 | train_criterion     | MSELoss          | 0      | train
2 | val_criterion       | MSELoss          | 0      | train
3 | train_metrics       | MetricCollection | 0      | train
4 | val_metrics         | MetricCollection | 0      | train
5 | past_cov_projection | _ResidualBlock   | 1.6 K  | train
6 | encoders            | Sequential       | 78.2 K | train
7 | decoders            | Sequential       | 165 K  | train
8 | temporal_decoder    | _ResidualBlock   | 594    | train
9 | lookback_skip       | Linear           | 1.8 K  | train
-----------------------------------------------------------------
247 K     Trainable params
0         Non-trainable params
247 K     Total params
0.989     Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | criterion       | MSELoss          | 0      | train
1 | train_criterion | MSELoss          | 0      | train
2 | val_criterion   | MSELoss          | 0      | train
3 | train_metrics   | MetricCollection | 0      | train
4 | val_metrics     | MetricCollection | 0      | train
5 | stacks          | ModuleList       | 9.1 M  | train
-------------------------------------------------------------
9.1 M     Trainable params
3.6 K     Non-trainable params
9.1 M     Total params
36.305    Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
Out[20]:
Model sMAPE RMSE
0 NHiTS 115.131707 0.518734
1 TiDE 127.613580 0.600994
2 NBEATS 118.307593 0.588103
In [21]:
# Convert TimeSeries to pandas DataFrame for plotting
actual_scaled_df2 = target_scaled_series2.pd_dataframe()
predicted_df2 = predictions2['NHiTS'].pd_dataframe() #Chosen NHiTS as it has minm RMSE score and is the best model among rest.

# Create traces
fig = go.Figure()

# Add actual data trace
for col in actual_scaled_df2.columns:
    fig.add_trace(go.Scatter(x=actual_scaled_df2.index, y=actual_scaled_df2[col], mode='lines', name=f'Actual {col}'))

# Add predicted data trace
for col in predicted_df2.columns:
    fig.add_trace(go.Scatter(x=predicted_df2.index, y=predicted_df2[col], mode='lines', name=f'Predicted {col}'))

# Update layout
fig.update_layout(
    title="Actual vs Predicted Time Series Analysis-Raw",
    xaxis_title="Date",
    yaxis_title="Value",
    legend_title="Legend",
    width=1000,
    height=600
)

fig.show()

We get similar results here but post treatment, the errors are reduced so Outlier treatment and addressing multicollinaerity can play significant role.

Univariate Time Series Analysis. - Forecasting Unemployment without considering other features.¶

In [26]:
# Select relevant columns for the analysis
series = data['meanQ_Unemployment']

# Create a TimeSeries object
timeseries = TimeSeries.from_dataframe(data, value_cols='meanQ_Unemployment')

# Scale the data
scaler = Scaler()
scaled_series = scaler.fit_transform(timeseries)

# Split the data into train and test sets
train, test = scaled_series.split_before(0.8)

# Define models
models = {
    "NHiTS": NHiTSModel(input_chunk_length=48, output_chunk_length=36),
    "NBEATS": NBEATSModel(input_chunk_length=48, output_chunk_length=36),
    "TiDE": TiDEModel(input_chunk_length=48, output_chunk_length=36)
}

# DataFrame to store the results
results_df = pd.DataFrame(columns=['Models', 'sMAPE', 'RMSE'])

predictions = {}
# Loop over models
for model_name, model in models.items():
    # Fit the model
    model.fit(train)
    
    # Make predictions
    pred = model.predict(len(test))
    
    # Store predictions
    predictions[model_name] = pred
    
    # Calculate metrics
    model_smape = smape(test, pred)
    model_rmse = rmse(test, pred)
    
    # Append results to DataFrame
    results_df = results_df.append({
        'Models': model_name,
        'sMAPE': model_smape,
        'RMSE': model_rmse
    }, ignore_index=True)

results_df
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | criterion       | MSELoss          | 0      | train
1 | train_criterion | MSELoss          | 0      | train
2 | val_criterion   | MSELoss          | 0      | train
3 | train_metrics   | MetricCollection | 0      | train
4 | val_metrics     | MetricCollection | 0      | train
5 | stacks          | ModuleList       | 877 K  | train
-------------------------------------------------------------
852 K     Trainable params
24.6 K    Non-trainable params
877 K     Total params
3.509     Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type             | Params | Mode 
-------------------------------------------------------------
0 | criterion       | MSELoss          | 0      | train
1 | train_criterion | MSELoss          | 0      | train
2 | val_criterion   | MSELoss          | 0      | train
3 | train_metrics   | MetricCollection | 0      | train
4 | val_metrics     | MetricCollection | 0      | train
5 | stacks          | ModuleList       | 6.4 M  | train
-------------------------------------------------------------
6.4 M     Trainable params
1.6 K     Non-trainable params
6.4 M     Total params
25.559    Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | criterion        | MSELoss          | 0      | train
1 | train_criterion  | MSELoss          | 0      | train
2 | val_criterion    | MSELoss          | 0      | train
3 | train_metrics    | MetricCollection | 0      | train
4 | val_metrics      | MetricCollection | 0      | train
5 | encoders         | Sequential       | 29.1 K | train
6 | decoders         | Sequential       | 165 K  | train
7 | temporal_decoder | _ResidualBlock   | 594    | train
8 | lookback_skip    | Linear           | 1.8 K  | train
--------------------------------------------------------------
196 K     Trainable params
0         Non-trainable params
196 K     Total params
0.786     Total estimated model params size (MB)
Training: |                                                                                      | 0/? [00:00<…
`Trainer.fit` stopped: `max_epochs=100` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Predicting: |                                                                                    | 0/? [00:00<…
Out[26]:
Models sMAPE RMSE
0 NHiTS 135.284882 0.866495
1 NBEATS 129.268787 0.790406
2 TiDE 139.363406 1.106389
In [27]:
# Convert TimeSeries to pandas DataFrame for plotting
actual_scaled_df3 = scaled_series.pd_dataframe()
predicted_df3 = predictions['NHiTS'].pd_dataframe() #Chosen NHiTS as it has minm RMSE score and is the best model among rest.

# Create traces
fig = go.Figure()

# Add actual data trace
for col in actual_scaled_df3.columns:
    fig.add_trace(go.Scatter(x=actual_scaled_df3.index, y=actual_scaled_df3[col], mode='lines', name=f'Actual {col}'))

# Add predicted data trace
for col in predicted_df3.columns:
    fig.add_trace(go.Scatter(x=predicted_df3.index, y=predicted_df3[col], mode='lines', name=f'Predicted {col}'))

# Update layout
fig.update_layout(
    title="Actual vs Predicted Univariate Time Series Analysis",
    xaxis_title="Date",
    yaxis_title="Value",
    legend_title="Legend",
    width=1000,
    height=600
)

fig.show()

With the same parameters, tried univariate analysis, errors seem to be on higher end when compared to the multivariate one. Models work better when past_covariates are considered.

Insights based on above analysis¶

1) Any given time series data, 'date' format should be checked thouroughly incase there are different patterns either in the date format or date delimiter. 2) If there are multiple outliers across different columns, it should be treated. 3) If heatmap show multiple correlations among independent variables, then we should consider checking multicollinearity via VIF, if vif>10, and similar vif value shown between two variables, only one feature should be retained. 4) Model validation metrics shows errors are reduced when data is addressed with outlier treatment and multicollinearity. 5) very important - Experimentation with input_chunk_length and output_chunk_length parameters in the model.

a) n < output_chunk_length

b) When values of input_chunk_length and output_chunk_length are increased, model predict better compare to when less values of input_chunk_length and output_chunk_length in multivariate TSA and its vice versa in univariate TSA.
In [ ]: